import pandas as pd
from sentence_transformers import SentenceTransformer, util

# Pre-trained sentence-embedding model used for every comparison below
model = SentenceTransformer('efederici/sentence-bert-base')

# Tidy sentence tables: df1 is loaded from the DOCX corpus file and df2 from
# the PDF corpus file (read in that order)
df1, df2 = (
    pd.read_csv(csv_path)
    for csv_path in ('corpus_docx.tidy.csv', 'corpus_pdf.tidy.csv')
)

# Distinct document identifiers found in the DOCX table, in order of first
# appearance; only these documents are compared
doc_ids = df1['doc_id'].unique()

# Accumulators for the index-only and the full-text score records
results, results_full_text = [], []

# Compare every sentence of df1 with every sentence of df2 that shares the
# same doc_id. Documents with no usable sentence on either side are skipped.
for doc_id in doc_ids:

    print(f"Document ID: {doc_id}")

    group1 = df1[df1['doc_id'] == doc_id]
    group2 = df2[df2['doc_id'] == doc_id]

    # Empty CSV cells are read by pandas as NaN (a float), which is not text
    # and cannot be encoded, so drop them before building the lists.
    # NOTE: i / j below are therefore positions among the non-missing
    # sentences of the document.
    sentences1 = group1['sentence'].dropna().tolist()
    sentences2 = group2['sentence'].dropna().tolist()

    # Nothing to compare unless both sides contribute at least one sentence
    if not sentences1 or not sentences2:
        continue

    # Encode the sentences as embeddings
    embedding1 = model.encode(sentences1, convert_to_tensor=True)
    embedding2 = model.encode(sentences2, convert_to_tensor=True)

    # Pairwise cosine similarities: row i corresponds to sentences1[i] and
    # column j to sentences2[j]
    similarity_scores = util.pytorch_cos_sim(embedding1, embedding2)

    # Convert the whole matrix to nested Python floats once, instead of
    # indexing the tensor and converting a scalar for every single pair
    score_rows = similarity_scores.tolist()

    # Print and record the score of every sentence pair
    for i, (sentence1, score_row) in enumerate(zip(sentences1, score_rows)):
        for j, (sentence2, score) in enumerate(zip(sentences2, score_row)):
            print(f"Similarity between '{sentence1}' and '{sentence2}': {score}")
            print()  # Print an empty line between sentences
            results.append({'doc_id': doc_id, 'i': i, 'j': j, 'score': score})
            results_full_text.append({
                'doc_id': doc_id,
                'sentence1': sentence1,
                'sentence2': sentence2,
                'score': score,
            })
      
# Build the result tables with explicit column lists so that the CSV header
# row is still written when no sentence pair was compared; an empty list
# without `columns` yields a column-less frame and a header-less file.
results_df = pd.DataFrame(results, columns=['doc_id', 'i', 'j', 'score'])
results_full_text_df = pd.DataFrame(
    results_full_text,
    columns=['doc_id', 'sentence1', 'sentence2', 'score'],
)

# Save both tables as CSV files, omitting the DataFrame index
results_df.to_csv('cos_sim_results.csv', index=False)
results_full_text_df.to_csv('cos_sim_results_full_text.csv', index=False)

